library(tidyverse)
## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.2
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.2
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(tidygraph)
## Warning: package 'tidygraph' was built under R version 3.5.2
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.5.2
# library(smglr)
library(broom)
## Warning: package 'broom' was built under R version 3.5.2
library(ggforce)
## Warning: package 'ggforce' was built under R version 3.5.2
library(tictoc)
data_folder = '../data/'
plots_folder = '../plots/'
## Load data ----
load(str_c(data_folder, '01_parsed.Rdata'))
There are 203 PhD programs producing graduate students in the data
univ_df %>%
filter(total_placements > 0) %>%
nrow()
## [1] 203
Finding: 37 PhD programs (37/203 = 18%) produce about 50% of permanent placements
ggplot(univ_df, aes(perm_placement_rank, frac_cum_perm_placements)) +
geom_step() +
scale_x_continuous(labels = scales::percent_format(),
name = 'PhD Programs') +
scale_y_continuous(labels = scales::percent_format(),
name = 'Permanent Placements')
## Warning: Removed 23 rows containing missing values (geom_path).
univ_df %>%
filter(frac_cum_perm_placements <= .5) %>%
arrange(perm_placement_rank) %>%
mutate(perm_placement_rank = row_number()) %>%
select(perm_placement_rank, univ_name,
perm_placements, frac_cum_perm_placements) %>%
knitr::kable()
| perm_placement_rank | univ_name | perm_placements | frac_cum_perm_placements |
|---|---|---|---|
| 1 | University of Oxford | 34 | 0.0270056 |
| 2 | Princeton University | 28 | 0.0492454 |
| 3 | Graduate Center of the City University of New York | 25 | 0.0691025 |
| 4 | Columbia University | 25 | 0.0889595 |
| 5 | University of Notre Dame | 23 | 0.1072280 |
| 6 | Rutgers University | 22 | 0.1247021 |
| 7 | Stanford University | 20 | 0.1405878 |
| 8 | Villanova University | 20 | 0.1564734 |
| 9 | Boston College | 19 | 0.1715647 |
| 10 | University of Toronto | 19 | 0.1866561 |
| 11 | University of Chicago | 18 | 0.2009531 |
| 12 | Emory University | 18 | 0.2152502 |
| 13 | Yale University | 18 | 0.2295473 |
| 14 | Katholieke Universiteit Leuven | 18 | 0.2438443 |
| 15 | University of North Carolina at Chapel Hill | 17 | 0.2573471 |
| 16 | Duquesne University | 16 | 0.2700556 |
| 17 | University of Michigan | 16 | 0.2827641 |
| 18 | Baylor University | 16 | 0.2954726 |
| 19 | University of California, Berkeley | 15 | 0.3073868 |
| 20 | University of Wisconsin-Madison | 15 | 0.3193010 |
| 21 | Fordham University | 14 | 0.3304210 |
| 22 | Saint Louis University | 14 | 0.3415409 |
| 23 | University of Pittsburgh | 14 | 0.3526608 |
| 24 | DePaul University | 14 | 0.3637808 |
| 25 | New York University | 14 | 0.3749007 |
| 26 | Stony Brook University | 14 | 0.3860207 |
| 27 | Harvard University | 14 | 0.3971406 |
| 28 | University of California, Riverside | 13 | 0.4074662 |
| 29 | University of South Florida | 13 | 0.4177919 |
| 30 | University of Southern California | 13 | 0.4281176 |
| 31 | Indiana University Bloomington | 13 | 0.4384432 |
| 32 | University of Virginia | 13 | 0.4487689 |
| 33 | Australian National University | 13 | 0.4590945 |
| 34 | University of Edinburgh | 13 | 0.4694202 |
| 35 | University of Pittsburgh (HPS) | 13 | 0.4797458 |
| 36 | Pennsylvania State University | 12 | 0.4892772 |
| 37 | University of Texas at Austin | 12 | 0.4988086 |
## Placements at PhD-producing programs
grad_programs = univ_df %>%
filter(total_placements > 0, country %in% c('U.S.')) %>%
pull(univ_id)
# gp2 = individual_df %>%
# count(placing_univ_id) %>%
# pull(placing_univ_id)
individual_df %>%
filter(permanent, hiring_univ_id %in% grad_programs) %>%
filter(placing_univ_id %in% grad_programs) %>%
filter(position_type == 'Tenure-Track') %>%
count(placing_univ) %>%
arrange(desc(n)) %>%
rename(phd_program_placements = n) %>%
mutate(cum_phd_program_placements = cumsum(phd_program_placements),
share_phd_program_placements = cum_phd_program_placements / sum(phd_program_placements)) %>%
slice(1:20) %>%
knitr::kable(digits = 2)
| placing_univ | phd_program_placements | cum_phd_program_placements | share_phd_program_placements |
|---|---|---|---|
| Princeton University | 12 | 12 | 0.07 |
| Rutgers University | 11 | 23 | 0.13 |
| University of California, Berkeley | 10 | 33 | 0.18 |
| New York University | 8 | 41 | 0.23 |
| University of Chicago | 7 | 48 | 0.27 |
| Yale University | 7 | 55 | 0.31 |
| Harvard University | 6 | 61 | 0.34 |
| University of Arizona | 6 | 67 | 0.37 |
| University of Notre Dame | 6 | 73 | 0.41 |
| Columbia University | 5 | 78 | 0.43 |
| Stanford University | 5 | 83 | 0.46 |
| University of California, Irvine (LPS) | 5 | 88 | 0.49 |
| University of California, Los Angeles | 5 | 93 | 0.52 |
| University of Pittsburgh (HPS) | 5 | 98 | 0.54 |
| Brown University | 4 | 102 | 0.57 |
| Massachusetts Institute of Technology | 4 | 106 | 0.59 |
| University of North Carolina at Chapel Hill | 4 | 110 | 0.61 |
| University of Pittsburgh | 4 | 114 | 0.63 |
| University of Southern California | 4 | 118 | 0.66 |
| University of Wisconsin-Madison | 4 | 122 | 0.68 |
## build network ----
## NB Only permanent placements
hiring_network = individual_df %>%
filter(permanent) %>%
select(placing_univ_id, hiring_univ_id, everything()) %>%
graph_from_data_frame(directed = TRUE,
vertices = univ_df) %>%
as_tbl_graph() %>%
## Clean nodes (universities) that aren't in the network
mutate(degree = centrality_degree(mode = 'total')) %>%
filter(degree > 0)
## 1 giant component contains almost all of the schools
components(hiring_network)$csize
## [1] 737 2 2 4 2 2 1 2 2 2 2 2 1 2
Out- and in-centrality
to_reverse = function (graph) {
## Reverse edge direction
if (!is.directed(graph))
return(graph)
e <- get.data.frame(graph, what="edges")
## swap "from" & "to"
neworder <- 1:length(e)
neworder[1:2] <- c(2,1)
e <- e[neworder]
names(e) <- names(e)[neworder]
vertex_df = get.data.frame(graph, what = 'vertices')
if (ncol(vertex_df) > 0) {
return(as_tbl_graph(graph.data.frame(e, vertices = vertex_df)))
} else {
return(as_tbl_graph(graph.data.frame(e)))
}
}
set.seed(42)
nzmin = function(x, na.rm = TRUE) {
min(x[x > 0], na.rm = na.rm)
}
damping = .3
hiring_network = hiring_network %>%
mutate(in_centrality = centrality_eigen(weights = NA,
directed = TRUE),
in_pr = centrality_pagerank(weights = NA,
directed = TRUE,
damping = damping,
personalized = NULL)) %>%
morph(to_reverse) %>%
mutate(out_centrality = centrality_eigen(weights = NA,
directed = TRUE),
out_pr = centrality_pagerank(weights = NA,
directed = TRUE,
damping = damping,
personalized = NULL)) %>%
unmorph() %>%
## Trim 0s to minimum non-zero values
mutate(out_centrality = ifelse(out_centrality == 0,
nzmin(out_centrality),
out_centrality),
in_centrality = ifelse(in_centrality == 0,
nzmin(in_centrality),
in_centrality))
## Warning: `as_quosure()` requires an explicit environment as of rlang 0.3.0.
## Please supply `env`.
## This warning is displayed once per session.
## Check relationship btwn unweighted multiedges and weighted edges
## Strong correlation, esp by approx rank and high/low prestige
## But some movement, both numerically and ordinally
# E(hiring_network)$weight = count_multiple(hiring_network)
# hiring_network_simp = simplify(hiring_network)
#
# set.seed(42)
# V(hiring_network_simp)$out_centrality = hiring_network_simp %>%
# graph.reverse() %>%
# eigen_centrality(directed = TRUE) %>%
# .$vector
#
# tibble(name = V(hiring_network)$univ_name,
# multi = V(hiring_network)$out_centrality,
# simp = V(hiring_network_simp)$out_centrality) %>%
# ggplot(aes(log10(simp), log10(multi))) +
# geom_point() +
# stat_function(fun = function (x) x)
## exploring centrality scores ----
PageRank centrality is almost entirely determined by degree So we use eigenvector centrality instead
ggplot(hiring_network, aes(degree, log10(out_pr))) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
There are two clear groups of centrality scores, with high scores (in the range of ~10^-12 to 1) and low scores (10^-15 and smaller).
##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
ggplot(as_tibble(hiring_network), aes(out_centrality)) +
geom_density() + geom_rug() +
scale_x_continuous(trans = 'log10',
name = 'Out centrality') +
theme_minimal()
ggsave(str_c(plots_folder, '02_out_centrality.png'),
height = 3, width = 6)
ggplot(as_tibble(hiring_network), aes(in_centrality)) +
geom_density() + geom_rug() +
scale_x_continuous(trans = 'log10')
ggplot(as_tibble(hiring_network),
aes(out_centrality, in_centrality,
color = cluster_label,
text = univ_name)) +
geom_jitter() +
scale_x_log10() + scale_y_log10()
plotly::ggplotly()
# ## Check stability of centrality calculation
# iterated_centrality = 1:500 %>%
# map(~ {hiring_network %>%
# graph.reverse() %>%
# eigen_centrality(directed = TRUE, weights = NA) %>%
# .$vector}) %>%
# transpose() %>%
# map(unlist) %>%
# map(~ tibble(out_centrality = .)) %>%
# bind_rows(.id = 'univ_id') %>%
# group_by(univ_id) %>%
# summarize(min = min(out_centrality),
# max = max(out_centrality),
# median = median(out_centrality))
#
# ## Lots of variation among low-prestige
# ggplot(iterated_centrality,
# aes(reorder(univ_id, median),
# median)) +
# geom_pointrange(aes(ymin = min, ymax = max)) +
# scale_y_log10()
#
# ## But extremely stable results among high-prestige
# iterated_centrality %>%
# filter(min > 10^-12) %>%
# ggplot(aes(reorder(univ_id, median),
# median)) +
# geom_pointrange(aes(ymin = min, ymax = max)) +
# scale_y_log10()
We completely rewire the network, preserving out-degree (number of new PhDs) and in-degree (number of permanent hires), but otherwise randomly matching job-seekers to positions. We then recalculate out-centrality. The plots below show the out-centrality distributions for each random rewiring, with the actual distribution in red. The distributions are always bimodal, indicating that the observed split into high- and low-centrality programs is due in part to the way PhD production and hiring are distributed across institutions. However, the high-centrality subset is never as small as in the actual hiring network. This indicates that bimodiality is not due only to the degree distributions. Some other factor is exacerbating the structural inequality in the hiring network.
set.seed(78910)
map(1:100,
~ sample_degseq(degree(hiring_network, mode = 'out'),
degree(hiring_network, mode = 'in'))
) %>%
map(to_reverse) %>%
map(eigen_centrality, directed = TRUE, weights = NULL) %>%
map(~ .$vector) %>%
map(log10) %>%
map(density) %>%
map(~ tibble(x = .$x, y = .$y)) %>%
bind_rows(.id = 'iteration') %>%
ggplot(aes(x, y)) +
geom_line(aes(group = iteration), alpha = .5) +
stat_density(data = as_tibble(hiring_network), geom = 'line',
aes(log10(out_centrality), ..density..),
color = 'red', size = 1)
Finding: High output is only modestly correlated w/ centrality. Programs like Leuven, New School, and Boston College produce lots of PhDs, but they aren’t placed into the high-centrality departments.
ggplot(as_tibble(hiring_network),
aes(total_placements, log10(out_centrality))) +
geom_point(aes(text = univ_name)) +
geom_smooth()
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
## Warning: Removed 581 rows containing missing values (geom_point).
plotly::ggplotly()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
# univ_df %>%
# mutate(out_centrality_log = log10(out_centrality)) %>%
# filter(!is.na(out_centrality_log) &
# out_centrality > 0 &
# !is.na(total_placements)) %>%
# select(total_placements, out_centrality_log) %>%
# cor()
hiring_network %>%
select(total_placements, out_centrality) %>%
as_tibble() %>%
filter(complete.cases(.)) %>%
mutate(out_centrality = log10(out_centrality)) %>%
cor()
## total_placements out_centrality
## total_placements 1.000000 0.498529
## out_centrality 0.498529 1.000000
ggplot(hiring_network,
aes(perm_placement_rank, log10(out_centrality))) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
## Warning: Removed 581 rows containing missing values (geom_point).
ggplot(hiring_network,
aes(log10(out_centrality), perm_placement_rate)) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
## Warning: Removed 581 rows containing missing values (geom_point).
plotly::ggplotly()
## Warning: Removed 581 rows containing non-finite values (stat_smooth).
as_tibble(hiring_network) %>%
select(total_placements, perm_placement_rate,
out_centrality) %>%
filter(complete.cases(.)) %>%
mutate(out_centrality = log10(out_centrality)) %>%
cor()
## total_placements perm_placement_rate out_centrality
## total_placements 1.00000000 -0.01568778 0.498529
## perm_placement_rate -0.01568778 1.00000000 0.189328
## out_centrality 0.49852895 0.18932795 1.000000
## Add centrality scores to univ_df
univ_df = hiring_network %>%
as_tibble() %>%
rename(univ_id = name) %>%
left_join(univ_df, .)
## Joining, by = c("univ_id", "univ_name", "cluster_lvl1", "cluster_lvl2", "cluster_lvl3", "cluster_lvl4", "cluster_label", "city", "state", "country", "total_placements", "perm_placements", "perm_placement_rate", "frac_perm_placements", "cum_perm_placements", "frac_cum_perm_placements", "perm_placement_rank", "aos_diversity", "m_count", "w_count", "o_count", "gender_na_count", "frac_w")
## Movement down the prestige hierarchy
individual_df %>%
filter(permanent) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('placing_univ_id' = 'univ_id')) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('hiring_univ_id' = 'univ_id')) %>%
# filter(out_centrality.y > 10^-12) %>%
select(person_id, aos_category,
placing = out_centrality.x,
hiring = out_centrality.y) %>%
gather(variable, value, placing, hiring) %>%
mutate(variable = fct_rev(variable)) %>%
ggplot(aes(variable, log10(value))) +
geom_point() +
geom_line(aes(group = person_id),
alpha = .25) +
xlab('university') +
ylab('centrality (log10)') +
theme_minimal()
ggsave(str_c(plots_folder, '02_prestige_movement.png'),
width = 3, height = 4)
## Diagonal line indicates where hiring program has the same centrality as the placing program.
## Most placements are below this line, indicating that the centrality measure captures the idea that people typically are hired by programs with lower status
## The few placements above the line indicate that, even when individuals are hired by programs with higher status, the increase is relatively minor: no more than 1 point on the log10 scale
individual_df %>%
filter(permanent) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('placing_univ_id' = 'univ_id')) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('hiring_univ_id' = 'univ_id')) %>%
filter(out_centrality.y > 10^-12) %>%
select(person_id,
placing = out_centrality.x,
hiring = out_centrality.y) %>%
ggplot(aes(log10(placing), log10(hiring))) +
geom_jitter() +
stat_function(fun = identity)
## community detection ----
## Extract giant component
hiring_network_gc = hiring_network %>%
components %>%
.$csize %>%
{which(. == max(.))} %>%
{components(hiring_network)$membership == .} %>%
which() %>%
induced_subgraph(hiring_network, .) %>%
as_tbl_graph()
walk_len = rep(2:100, 1)
## ~24 sec
tic()
comm_stats = walk_len %>%
map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
map(~ list(sizes = sizes(.x), length = length(.x))) %>%
map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
n_comms = .x$length)) %>%
mutate(walk_len = walk_len,
delta_H = log2(n_comms) - H)
toc()
## 22.242 sec elapsed
ggplot(comm_stats, aes(walk_len, H)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(comm_stats, aes(walk_len, n_comms)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(comm_stats, aes(n_comms, delta_H)) +
geom_text(aes(label = walk_len))
Select the walk length that minimizes both delta_H (flatter community distribution) and n_comms (fewer communities) using regression residuals
walk_length = lm(delta_H ~ n_comms, data = comm_stats) %>%
augment(comm_stats) %>%
arrange(.resid) %>%
pull(walk_len) %>%
first()
walk_length
## [1] 17
communities = cluster_walktrap(hiring_network_gc, steps = walk_length)
V(hiring_network_gc)$community = membership(communities)
## Summary of community sizes
hiring_network_gc %>%
as_tibble() %>%
count(community) %>%
pull(n) %>%
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 7.50 12.71 13.00 156.00
univ_df = univ_df %>%
left_join({hiring_network_gc %>%
as_tibble() %>%
select(univ_id = name, community) %>%
mutate(community = as.character(community))})
## Joining, by = "univ_id"
univ_df %>%
filter(!is.na(community)) %>%
count(community) %>%
ggplot(aes(n)) +
geom_bar(aes(text = n)) +
scale_x_continuous(name = 'Community size (# programs)')
## Warning: Ignoring unknown aesthetics: text
plotly::ggplotly()
cluster_vars = univ_df %>%
select(matches('cluster')) %>%
names()
univ_df %>%
filter(!is.na(community), !is.na('cluster_lvl4'))
## # A tibble: 737 x 29
## univ_id univ_name cluster_lvl1 cluster_lvl2 cluster_lvl3 cluster_lvl4
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 3002 North D… <NA> <NA> <NA> <NA>
## 2 117 Air Forc… <NA> <NA> <NA> <NA>
## 3 1228 Al-Asmar… <NA> <NA> <NA> <NA>
## 4 353 Albany S… <NA> <NA> <NA> <NA>
## 5 707 Alleghen… <NA> <NA> <NA> <NA>
## 6 385 American… <NA> <NA> <NA> <NA>
## 7 1169 American… <NA> <NA> <NA> <NA>
## 8 1134 American… <NA> <NA> <NA> <NA>
## 9 701 Amherst … <NA> <NA> <NA> <NA>
## 10 3394 Anderson… <NA> <NA> <NA> <NA>
## # … with 727 more rows, and 23 more variables: cluster_label <chr>,
## # city <chr>, state <chr>, country <chr>, total_placements <int>,
## # perm_placements <int>, perm_placement_rate <dbl>,
## # frac_perm_placements <dbl>, cum_perm_placements <int>,
## # frac_cum_perm_placements <dbl>, perm_placement_rank <dbl>,
## # aos_diversity <dbl>, m_count <dbl>, w_count <dbl>, o_count <dbl>,
## # gender_na_count <dbl>, frac_w <dbl>, degree <dbl>,
## # in_centrality <dbl>, in_pr <dbl>, out_centrality <dbl>, out_pr <dbl>,
## # community <chr>
Finding: There is no correlation between semantic clusters and topological communities.
univ_df %>%
filter(!is.na(community), !is.na(cluster_label)) %>%
select(community, cluster_label) %>%
table() %>%
chisq.test(simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: .
## X-squared = 426.35, df = NA, p-value = 0.08846
univ_df %>%
filter(!is.na(community), !is.na(cluster_label)) %>%
count(community, cluster_label) %>%
rename(cluster_n = n) %>%
group_by(community) %>%
mutate(community_tot = sum(cluster_n),
cluster_frac = cluster_n / community_tot,
H = sum(cluster_frac * log2(cluster_frac))) %>%
ungroup() %>%
ggplot(aes(fct_reorder(community, community_tot, .desc = FALSE),
cluster_n, fill = cluster_label)) +
geom_col() +
coord_flip() +
xlab('Topological communities') +
ylab('# of schools in community') +
scale_fill_brewer(palette = 'Set1',
name = 'Semantic\nclusters')
## high-prestige universities
## Start w/ Oxford, and work upstream
## Only need to go 11 or 12 steps to get closure
1:25 %>%
map(~ make_ego_graph(hiring_network, order = .x,
nodes = '512', mode = 'in')) %>%
flatten() %>%
map(~ length(V(.x))) %>%
tibble(order = 1:length(.),
size = unlist(.)) %>%
ggplot(aes(order, size)) + geom_point()
prestigious = make_ego_graph(hiring_network, order = 12,
nodes = '512',
mode = 'in') %>%
.[[1]] %>%
as_tbl_graph()
## How large is the high-prestige community?
## 56 programs; 7% of all programs in the network;
## 28% of programs with at least 1 placement in the dataset
length(V(prestigious))
## [1] 56
length(V(prestigious)) / length(V(hiring_network))
## [1] 0.0733945
length(V(prestigious)) / sum(univ_df$total_placements > 0, na.rm = TRUE)
## [1] 0.2758621
## What fraction of hires are within high-prestige?
## 13% of all permanent hires; 7% of all hires
length(E(prestigious)) / length(E(hiring_network))
## [1] 0.1278793
length(E(prestigious)) / nrow(individual_df)
## [1] 0.06552707
# layout_prestigious = layout_with_focus(prestigious,
# which(V(prestigious)$univ_name == 'University of Oxford')) %>%
# `colnames<-`(c('x', 'y')) %>%
# as_tibble()
layout_prestigious = create_layout(prestigious, 'focus',
focus = which(V(prestigious)$univ_name == 'University of Oxford'))
# png(file = '02_prestigious_net.png',
# width = 11, height = 11, units = 'in', res = 400)
# set.seed(24)
ggraph(layout_prestigious) +
geom_node_label(aes(label = univ_name,
size = log10(out_centrality),
fill = log10(out_centrality)),
color = 'white') +
geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')),
alpha = .25,
strength = 5) +
scale_size_continuous(range = c(.5, 3), guide = FALSE) +
scale_fill_viridis(name = 'out centrality (log10)') +
coord_cartesian(xlim = c(-3.5, 5.5), clip = 'on') +
theme_graph() +
theme(legend.position = 'bottom',
plot.margin = margin(0, unit = 'cm'))
ggsave(str_c(plots_folder, '02_prestigious_net.png'),
width = 6, height = 4, dpi = 600, scale = 2)
# dev.off()
## High-prestige = high centrality group
univ_df = univ_df %>%
mutate(prestige = ifelse(univ_id %in% V(prestigious)$name,
'high-prestige',
'low-prestige'))
V(hiring_network)$prestigious = V(hiring_network)$out_centrality > 1e-12
V(hiring_network_gc)$prestigious = V(hiring_network_gc)$out_centrality > 1e-12
ggplot(univ_df, aes(prestige, log10(out_centrality))) +
geom_jitter()
## Warning: Removed 308 rows containing missing values (geom_point).
## Prestige status and clusters
## High-prestige are spread throughout, but #5 is mostly low-prestige
univ_df %>%
filter(!is.na(cluster_label)) %>%
ggplot(aes(cluster_label, color = prestige)) +
geom_point(stat = 'count') +
geom_line(aes(group = prestige), stat = 'count')
## High-prestige are mostly in the largest community
univ_df %>%
filter(!is.na(community)) %>%
ggplot(aes(as.integer(community), fill = prestige)) +
geom_bar()
## What fraction of high-prestige graduates end up in high-prestige programs?
## 24% of those w/ permanent placements
individual_df %>%
filter(permanent) %>%
left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
select(placing = prestige.x, hiring = prestige.y) %>%
count(placing, hiring) %>%
group_by(placing) %>%
mutate(share = n / sum(n))
## # A tibble: 3 x 4
## # Groups: placing [2]
## placing hiring n share
## <chr> <chr> <int> <dbl>
## 1 high-prestige high-prestige 161 0.239
## 2 high-prestige low-prestige 514 0.761
## 3 low-prestige low-prestige 584 1
Finding: Median permanent placement rate for high-prestige programs is 22 points higher than for low-prestige programs. However, variation is also wide within each group;
This is also not yet controlling for graduation year, area, or demographics.
ggplot(univ_df, aes(prestige, perm_placement_rate,
label = univ_name)) +
geom_sina(aes(size = total_placements),
alpha = .5) +
geom_violin(color = 'red', draw_quantiles = .5,
fill = 'transparent') +
# geom_jitter(aes(size = total_placements)) +
scale_y_continuous(labels = scales::percent_format()) +
ylab('permanent placement rate') +
scale_size(name = 'total placements') +
theme_minimal()
## Warning: Removed 868 rows containing non-finite values (stat_sina).
## Warning: Removed 868 rows containing non-finite values (stat_ydensity).
ggsave(str_c(plots_folder, '02_prestige_placement.png'),
width = 6, height = 4)
## Warning: Removed 868 rows containing non-finite values (stat_sina).
## Warning: Removed 868 rows containing non-finite values (stat_ydensity).
plotly::ggplotly()
## Warning: Removed 868 rows containing non-finite values (stat_sina).
## Warning: Removed 868 rows containing non-finite values (stat_ydensity).
univ_df %>%
group_by(prestige) %>%
summarize_at(vars(perm_placement_rate),
funs(median, max, min), na.rm = TRUE)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 2 x 4
## prestige median max min
## <chr> <dbl> <dbl> <dbl>
## 1 high-prestige 0.638 1 0.235
## 2 low-prestige 0.417 1 0
By randomly rewiring the network, we can test the stability of the prestige categories to data errors and the short time frame of our data. In each of 500 permutations, we randomly rewire 10% of the edges in the permanent hiring network, then calculate out-centralities on the rewired network. Using a threshold of 10^-12 for high-prestige status, we count the fraction of rewired networks in which each program is high-prestige.
## ~6 sec
tic()
set.seed(13579)
permutations = 1:500 %>%
## Rewire 10% of edges
map(~ {hiring_network %>%
rewire(keeping_degseq(loops = TRUE,
niter = floor(.05 * length(E(.)))))}) %>%
## Calculate out-centralities
map(~ {.x %>%
to_reverse() %>%
eigen_centrality(directed = TRUE, weights = NA) %>%
.$vector}) %>%
transpose() %>%
map(unlist) %>%
## Fraction where program is high-prestige
map(~ sum(. > 10^-12) / length(.)) %>%
map(~ tibble(frac_high_prestige = .)) %>%
bind_rows(.id = 'univ_id')
toc()
## 7.458 sec elapsed
## frac_high_prestige indicates the fraction of permutations in which the program was high-prestige
ggplot(permutations, aes(frac_high_prestige)) +
geom_density() +
geom_rug()
univ_df = left_join(univ_df, permutations)
## Joining, by = "univ_id"
Finding: Counterfactual high-prestige status is strongly correlated with centrality ranking.
ggplot(univ_df, aes(log10(out_centrality), frac_high_prestige)) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 308 rows containing non-finite values (stat_smooth).
## Warning: Removed 308 rows containing missing values (geom_point).
plotly::ggplotly()
## Warning: Removed 308 rows containing non-finite values (stat_smooth).
ggplot(univ_df, aes(perm_placements, frac_high_prestige, color = prestige)) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 889 rows containing non-finite values (stat_smooth).
## Warning: Removed 889 rows containing missing values (geom_point).
plotly::ggplotly()
## Warning: Removed 889 rows containing non-finite values (stat_smooth).
Finding: For low-prestige programs, counterfactual prestige seems to depend on the extent of the program’s downstream hiring network. Compare Boston College (19 permanent placements; 40% high prestige) to Leuven (18 permanent placements; 20% high prestige).
bc_leuven_net = make_ego_graph(hiring_network, order = 10,
nodes = c('529', '38'),
mode = 'out') %>%
map(as_tbl_graph) %>%
reduce(bind_graphs) %>%
mutate(perm_placements = ifelse(is.na(perm_placements), 0, perm_placements))
bc_leuven_layout = create_layout(bc_leuven_net, 'stress')
ggraph(bc_leuven_layout) +
geom_node_label(aes(label = univ_name,
size = perm_placements,
fill = perm_placements),
color = 'white') +
geom_edge_fan(arrow = arrow(angle = 45,
length = unit(.1, 'inches'),
type = 'closed'),
alpha = .3) +
scale_size_continuous(range = c(1, 5),
# na.value = 1,
name = 'placements',
guide = FALSE) +
scale_fill_viridis(na.value = 'black', name = 'permanent\nplacements') +
theme_graph()
ggsave(str_c(plots_folder, '02_bc_leuven.png'),
height = 6, width = 6, dpi = 600, scale = 1.25)
Coarser community structure
# large_comms = which(sizes(communities) > 20)
# V(hiring_network_gc)$community_coarse = ifelse(
# V(hiring_network_gc)$community %in% large_comms,
# V(hiring_network_gc)$community,
# NA)
## Big giant hairy ball
## ~13 sec
# tic()
# layout_net = hiring_network %>%
# layout_with_stress() %>%
# `colnames<-`(c('x', 'y')) %>%
# as_tibble()
# toc()
## Focus on Oxford
## GC only; ~7 sec
# tic()
# layout_net = create_layout(hiring_network_gc, 'focus', focus = 512)
# toc()
## Stress majorization
## ~2 sec
tic()
layout_net = create_layout(hiring_network_gc, 'stress')
toc()
## 1.915 sec elapsed
layout_net %>%
## NB neither of these preserve the layout_tbl_graph class, which breaks things when we get to ggraph()
# filter(total_placements > 0) %>%
# induced_subgraph(which(degree(., mode = 'out') > 0)) %>%
ggraph() +
geom_edge_parallel(arrow = arrow(length = unit(.02, 'npc'),
angle = 15,
type = 'closed'),
# spread = 5,
alpha = .05) +
geom_node_point(aes(#size = log10(out_centrality),
alpha = log10(out_centrality),
# color = as.factor(community)
color = cluster_label
# color = log10(out_centrality)
), size = 2) +
# scale_color_brewer(palette = 'Set1', na.value = 'black') +
scale_color_viridis_d(na.value = 'black', name = 'Semantic cluster') +
# scale_size_discrete(range = c(2, 6)) +
scale_alpha_continuous(range = c(.2, 1),
name = 'Out centrality (log10)') +
# scale_size_continuous(range = c(1, 3)) +
theme_graph()
ggsave(str_c(plots_folder, '02_hairball.png'),
width = 12, height = 12)
# hiring_network_gc %>%
# induced_subgraph(!is.na(V(.)$cluster_lvl1)) %>%
# ggraph() +
# geom_node_label(aes(size = prestigious,
# label = univ_name,
# # color = as.factor(community))) +
# fill = cluster_lvl4), color = 'black') +
# geom_edge_fan(aes(linetype = aos_category, color = aos_category),
# width = 1,
# arrow = arrow(length = unit(.01, 'npc')),
# spread = 5) +
# scale_edge_color_brewer(palette = 'Set1') +
# scale_fill_brewer(palette = 'Set3', na.value = 'grey75') +
# scale_size_discrete(range = c(3, 5)) +
# theme_graph()
Chord diagram
# hiring_network_gc %>%
# # induced_subgraph(which(degree(., mode = 'out') > 0)) %>%
# ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
# geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
# geom_node_point(aes(size = prestigious,
# # color = as.factor(community))) +
# color = cluster_lvl4)) +
# scale_color_brewer(palette = 'Set1', guide = FALSE) +
# theme_graph()
## Separate networks for each cluster
# cluster_ids = hiring_network %>%
# as_tibble() %>%
# split(., .$cluster_lvl4) %>%
# map(pull, name)
#
# cluster_ids %>%
# map(~induced_subgraph(hiring_network, .))
# hiring_network %>%
# # filter(!is.na(cluster_lvl4)) %>%
# ggraph(layout = 'manual',
# node.positions = layout_net) +
# geom_edge_fan(alpha = .1) +
# geom_node_point(aes(color = cluster_label), show.legend = TRUE) +
# # facet_nodes(vars(cluster_lvl4)) +
# scale_color_viridis_d(na.value = 'grey90') +
# theme_graph()
Save university-level data with network statistics
write_rds(univ_df, str_c(data_folder, '02_univ_net_stats.rds'))
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tictoc_1.0 ggforce_0.3.1 broom_0.5.2 ggraph_2.0.0
## [5] tidygraph_1.1.2 igraph_1.2.4 forcats_0.4.0 stringr_1.4.0
## [9] dplyr_0.8.2 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3
## [13] tibble_2.1.3 ggplot2_3.2.0 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] viridis_0.5.1 httr_1.4.0 jsonlite_1.6
## [4] viridisLite_0.3.0 modelr_0.1.4 shiny_1.2.0
## [7] assertthat_0.2.0 highr_0.7 cellranger_1.1.0
## [10] yaml_2.2.0 ggrepel_0.8.0 pillar_1.3.1
## [13] backports_1.1.3 lattice_0.20-38 glue_1.3.1
## [16] digest_0.6.18 RColorBrewer_1.1-2 promises_1.0.1
## [19] polyclip_1.10-0 rvest_0.3.4 colorspace_1.4-0
## [22] htmltools_0.3.6 httpuv_1.4.5.1 pkgconfig_2.0.2
## [25] haven_2.1.0 xtable_1.8-3 scales_1.0.0
## [28] tweenr_1.0.1 later_0.8.0 generics_0.0.2
## [31] farver_1.1.0 ellipsis_0.1.0 withr_2.1.2
## [34] lazyeval_0.2.1 cli_1.1.0 magrittr_1.5
## [37] crayon_1.3.4 readxl_1.3.1 mime_0.6
## [40] evaluate_0.13 fansi_0.4.0 nlme_3.1-137
## [43] MASS_7.3-51.1 xml2_1.2.0 tools_3.5.1
## [46] data.table_1.12.0 hms_0.4.2 plotly_4.8.0
## [49] munsell_0.5.0 compiler_3.5.1 rlang_0.4.0
## [52] grid_3.5.1 rstudioapi_0.10 htmlwidgets_1.3
## [55] crosstalk_1.0.0 labeling_0.3 rmarkdown_1.12
## [58] gtable_0.2.0 graphlayouts_0.5.0 R6_2.4.0
## [61] gridExtra_2.3 lubridate_1.7.4 knitr_1.22
## [64] utf8_1.1.4 stringi_1.4.3 Rcpp_1.0.1
## [67] tidyselect_0.2.5 xfun_0.5